rm(list=ls())
library(dplyr)
library(haven)
library(fst)

## Find everyone in 2017

## Baseline population: the full 2017 register extract.
bef <- read.fst("bef2017.fst")

## Keep the pre-treatment variables only and tag every row with the extract
## year. Birth year is derived from age as data_year - (ALDER + 1).
## NOTE(review): the "+ 1" presumably reflects ALDER being age on 1 January
## of the extract year -- confirm against the register documentation.
sibling_data <- mutate(
  select(bef, c("PNR", "MOR_ID", "FAR_ID", "FAMILIE_ID", "ALDER",
                "IE_TYPE", "GENERATION", "KOEN", "STATSB")),
  data_year     = 2017,
  year_of_birth = data_year - (ALDER + 1)
)

## Backfill people who are missing from the 2017 extract.
## Walk backwards through the yearly extracts (2016 down to 1986). From each
## year keep only the persons not yet collected from a later year, so a
## person's record is taken from the latest extract in which they appear.

years <- 2016:1986

for (k in years) {
  year <- k

  # Yearly population extract, reduced to the same variables as the baseline.
  bef <- read.fst(paste0("bef", year, ".fst"))
  bef <- select(bef, c("PNR", "MOR_ID", "FAR_ID", "FAMILIE_ID", "ALDER",
                       "IE_TYPE", "GENERATION", "KOEN", "STATSB"))

  # Drop everyone already present from a later year, then tag the newcomers
  # with this extract year and the birth year it implies.
  bef <- filter(bef, !(PNR %in% sibling_data$PNR))
  bef <- mutate(bef,
                data_year     = year,
                year_of_birth = year - (ALDER + 1))

  # Append the newly found persons to the running collection.
  sibling_data <- bind_rows(sibling_data, bef)

  # Progress: extract year and number of added observations.
  print(k)
  print(nrow(bef))
}

## Collapse to one row per mother-father pair.
## Persons whose mother or father id is empty (or missing) are dropped first,
## since they cannot be assigned to a full-sibling group.
##   n_sibship  : number of persons sharing this mother-father pair
##   n_girls    : number of those with KOEN == 2 (NA if any KOEN is missing)
##   fe_sibship : running integer id of the pair (sibling-group fixed effect)
## row_number() is used instead of 1:nrow(.) so an empty table yields an empty
## id column rather than a length-mismatch error.
mom_dad_data <-
  sibling_data %>%
  filter(MOR_ID != "", FAR_ID != "") %>%
  group_by(MOR_ID, FAR_ID) %>%
  summarise(n_sibship = n(),
            n_girls   = sum(KOEN == 2)) %>%
  ungroup() %>%
  mutate(fe_sibship = row_number())

## Merge number of girls, number of siblings and the sibling-group fixed
## effect back onto each individual.
sibling_data <-
  left_join(sibling_data, mom_dad_data,
            by = c("MOR_ID", "FAR_ID"))

## Keep everyone who could be assigned to a mother-father pair.
## NOTE(review): this also keeps only children (n_sibship == 1). The earlier
## comment said "at least one sibling"; if that is the intent, add
## n_sibship > 1 here. Later steps that need a second-born already exclude
## only children from first_born.
full_sib <-
  sibling_data %>%
  filter(!is.na(fe_sibship))

## Birth order within one sibling group.
##
## year_of_birth: birth years of the siblings in one group, already sorted in
##   ascending order (only neighbouring elements are compared).
##
## Returns an integer vector of the same length as the input. Each sibling
## gets the position of the first sibling in its run of equal birth years, so
## siblings born in the same year share a rank and the following rank is
## skipped (e.g. 1, 2, 2, 4). An empty input yields an empty result.
## Missing birth years cannot be compared and raise an error when the group
## has more than one member.
birth_order_func <- 
  function(year_of_birth){
    n <- length(year_of_birth)
    if (n == 0L) {
      return(integer(0))
    }
    if (n > 1L && anyNA(year_of_birth)) {
      stop("year_of_birth must not contain NA when ranking siblings",
           call. = FALSE)
    }
    # TRUE where a sibling's birth year differs from the previous sibling's;
    # the first sibling always starts a run.
    starts_run <- c(TRUE, year_of_birth[-1L] != year_of_birth[-n])
    # Positions that start a run keep their index, the others contribute 0;
    # the running maximum then carries the run's first index forward.
    cummax(seq_len(n) * starts_run)
  }

## Rank siblings and flag multiple births.
## Rows are ordered by birth year, then birth_order_func() assigns the birth
## order within each sibling group. `twin` is the number of siblings sharing
## a birth order within the group: 1 = singleton, 2 = twins, 3 = triplets ...
## (siblings born in the same calendar year are treated as a multiple birth).
## The result stays grouped by (fe_sibship, birth_order).
full_sib <- 
  full_sib %>% 
  group_by(fe_sibship) %>% 
  arrange(year_of_birth) %>%
  mutate(birth_order = birth_order_func(year_of_birth)) %>% 
  group_by(fe_sibship, birth_order) %>% 
  mutate(twin = n())


# Find everyone ever running for local gov---------------------------------------

# Municipal election years
years <- seq(1993, 2013, by = 4)

for (k in seq_along(years)) {
  year <- years[k]

  # Candidate file for this election, reduced to the relevant variables.
  # VALGT_JN becomes a logical (TRUE when the original value is "J").
  # run_kv is an empty placeholder so that the run indicator takes part in
  # the year-suffix renaming below.
  candidates_data <- read_sas(paste0("kv", year, "_recodes_pnr_afid.sas7bdat"))
  candidates_data <- select(candidates_data,
                            c("PNR", "VALGT_JN", "VALGTYPE", "PARTI"))
  candidates_data <- mutate(candidates_data,
                            VALGT_JN = VALGT_JN == "J",
                            run_kv   = NA)

  # Merge the election data onto the siblings; a person ran in this election
  # when a candidate record was matched (VALGT_JN not missing).
  suppressWarnings({
    full_sib <- left_join(full_sib, candidates_data, by = "PNR")
    full_sib <- mutate(full_sib, run_kv = !is.na(VALGT_JN))
  })

  # Suffix the freshly merged columns (everything but PNR) with the year.
  merged_cols <- names(candidates_data)[-1]
  names(full_sib)[match(merged_cols, names(full_sib))] <-
    paste0(merged_cols, "_", year)

  print(k)
}


# Per-person summaries over all municipal elections.
#
# full_sib is still grouped by (fe_sibship, birth_order) at this point, so a
# plain sum(c(...)) inside mutate() would total over every row of the group
# and siblings born in the same year would inherit each other's counts.
# The totals are therefore built row by row with rowSums(cbind(...)), which
# gives the same per-person result whether or not the data are grouped.
#   kv_n_elected    : number of elections won (missing = did not run = 0)
#   kv_ever_elected : elected at least once
#   kv_n_run        : number of elections the person ran in
#   kv_ever_run     : ran at least once
#   elected_kv_YYYY : 1 if elected in YYYY, otherwise 0 (missing recoded to 0)
full_sib <- 
  full_sib %>% 
  mutate(kv_n_elected = as.integer(
           rowSums(cbind(VALGT_JN_1993, VALGT_JN_1997,
                         VALGT_JN_2001, VALGT_JN_2005,
                         VALGT_JN_2009, VALGT_JN_2013), 
                   na.rm = TRUE)),
         kv_ever_elected = kv_n_elected > 0,
         kv_n_run = as.integer(
           rowSums(cbind(run_kv_1993, run_kv_1997,
                         run_kv_2001, run_kv_2005,
                         run_kv_2009, run_kv_2013))),
         kv_ever_run = kv_n_run > 0,
         elected_kv_1993 = rowSums(as.matrix(VALGT_JN_1993), na.rm = TRUE),
         elected_kv_1997 = rowSums(as.matrix(VALGT_JN_1997), na.rm = TRUE),
         elected_kv_2001 = rowSums(as.matrix(VALGT_JN_2001), na.rm = TRUE),
         elected_kv_2005 = rowSums(as.matrix(VALGT_JN_2005), na.rm = TRUE),
         elected_kv_2009 = rowSums(as.matrix(VALGT_JN_2009), na.rm = TRUE),
         elected_kv_2013 = rowSums(as.matrix(VALGT_JN_2013), na.rm = TRUE)) 


# Flag whether each person appears in the population extract of each
# municipal election year: in_year_<year> is 1 when present, 0 otherwise.

for (k in seq_along(years)) {
  year <- years[k]

  # PNRs present in this year's extract, each marked with 1.
  bef <- read.fst(paste0("bef", year, ".fst"))
  bef <- mutate(select(bef, "PNR"), in_year = 1)

  # Persons without a match get NA from the join, recoded to 0.
  full_sib <- left_join(full_sib, bef, by = "PNR")
  full_sib <- mutate(full_sib, in_year = coalesce(in_year, 0))

  # The flag is the last column; suffix its name with the year.
  last_col <- ncol(full_sib)
  names(full_sib)[last_col] <- paste0(names(full_sib)[last_col], "_", year)
  print(k)
}


# Find everyone ever running for parliament -------------------------------

# Parliamentary election years
years <- c(1990, 1994, 1998, 2001, 2005, 2007, 2011, 2015)

for (k in seq_along(years)) {
  year <- years[k]

  # Candidate file for this election, reduced to the relevant variables.
  # VALGT_JN becomes a logical (TRUE when the original value is "J").
  # run_fv is an empty placeholder so that the run indicator takes part in
  # the suffix renaming below.
  candidates_data <- read_sas(paste0("fv", year, "_recodes_pnr_afid.sas7bdat"))
  candidates_data <- select(candidates_data,
                            c("PNR", "VALGT_JN", "VALGTYPE", "PARTI"))
  candidates_data <- mutate(candidates_data,
                            VALGT_JN = VALGT_JN == "J",
                            run_fv   = NA)

  # Merge the election data onto the siblings; a person ran in this election
  # when a candidate record was matched (VALGT_JN not missing).
  suppressWarnings({
    full_sib <- left_join(full_sib, candidates_data, by = "PNR")
    full_sib <- mutate(full_sib, run_fv = !is.na(VALGT_JN))
  })

  # Suffix the freshly merged columns (everything but PNR) with the year and
  # an "_FV" marker to keep them apart from the municipal columns.
  merged_cols <- names(candidates_data)[-1]
  names(full_sib)[match(merged_cols, names(full_sib))] <-
    paste0(merged_cols, "_", year, "_FV")

  print(k)
}


# Per-person summaries over all parliamentary elections.
#
# full_sib is still grouped by (fe_sibship, birth_order) at this point, so a
# plain sum(c(...)) inside mutate() would total over every row of the group
# and siblings born in the same year would inherit each other's counts.
# The totals are therefore built row by row with rowSums(cbind(...)), which
# gives the same per-person result whether or not the data are grouped.
#   fv_n_elected    : number of elections won (missing = did not run = 0)
#   fv_ever_elected : elected at least once
#   fv_n_run        : number of elections the person ran in
#   fv_ever_run     : ran at least once
#   elected_fv_YYYY : 1 if elected in YYYY, otherwise 0 (missing recoded to 0)
full_sib <- 
  full_sib %>% 
  mutate(fv_n_elected = as.integer(
           rowSums(cbind(VALGT_JN_1990_FV, VALGT_JN_1994_FV, VALGT_JN_1998_FV,
                         VALGT_JN_2001_FV, VALGT_JN_2005_FV, VALGT_JN_2007_FV,
                         VALGT_JN_2011_FV, VALGT_JN_2015_FV), 
                   na.rm = TRUE)),
         fv_ever_elected = fv_n_elected > 0,
         fv_n_run = as.integer(
           rowSums(cbind(run_fv_1990_FV, run_fv_1994_FV, run_fv_1998_FV, 
                         run_fv_2001_FV, run_fv_2005_FV, run_fv_2007_FV,
                         run_fv_2011_FV, run_fv_2015_FV))),
         fv_ever_run = fv_n_run > 0,
         elected_fv_1990 = rowSums(as.matrix(VALGT_JN_1990_FV), na.rm = TRUE),
         elected_fv_1994 = rowSums(as.matrix(VALGT_JN_1994_FV), na.rm = TRUE),
         elected_fv_1998 = rowSums(as.matrix(VALGT_JN_1998_FV), na.rm = TRUE),
         elected_fv_2001 = rowSums(as.matrix(VALGT_JN_2001_FV), na.rm = TRUE),
         elected_fv_2005 = rowSums(as.matrix(VALGT_JN_2005_FV), na.rm = TRUE),
         elected_fv_2007 = rowSums(as.matrix(VALGT_JN_2007_FV), na.rm = TRUE),
         elected_fv_2011 = rowSums(as.matrix(VALGT_JN_2011_FV), na.rm = TRUE),
         elected_fv_2015 = rowSums(as.matrix(VALGT_JN_2015_FV), na.rm = TRUE)) 


# Flag whether each person appears in the population extract of each
# parliamentary election year: in_year_fv_<year> is 1 when present, 0
# otherwise.

for (k in seq_along(years)) {
  year <- years[k]

  # PNRs present in this year's extract, each marked with 1.
  bef <- read.fst(paste0("bef", year, ".fst"))
  bef <- mutate(select(bef, "PNR"), in_year_fv = 1)

  # Persons without a match get NA from the join, recoded to 0.
  full_sib <- left_join(full_sib, bef, by = "PNR")
  full_sib <- mutate(full_sib, in_year_fv = coalesce(in_year_fv, 0))

  # The flag is the last column; suffix its name with the year.
  last_col <- ncol(full_sib)
  names(full_sib)[last_col] <- paste0(names(full_sib)[last_col], "_", year)
  print(k)
}


# Subset to first borns ---------------------------------------------------

# Singleton second-borns: per sibling group, the second child's birth year
# and KOEN code (stored as female_second; the KOEN value is kept unchanged).
second_born <- 
  full_sib %>% 
  ungroup() %>%
  filter(birth_order == 2, twin == 1) %>% 
  select(fe_sibship,
         year_of_birth_second = year_of_birth,
         female_second        = KOEN)

# Singleton first-borns with the second-born's information attached; only
# first-borns whose sibling group has a singleton second-born are kept.
first_born <- 
  full_sib %>% 
  ungroup() %>%
  filter(birth_order == 1, twin == 1) %>% 
  left_join(second_born, by = "fe_sibship") %>% 
  filter(!is.na(year_of_birth_second))

# Store the first-born analysis sample and the full sibling data.
save(first_born, file = "firstborn.rdata")
save(full_sib, file = "all_sibs.rdata")
